# Eight-colour palette passed to the manual colour scales in the plots.
cb_palette <- c(
  "#999999", "#E69F00", "#56B4E9", "#009E73",
  "#F0E442", "#0072B2", "#D55E00", "#CC79A7"
)

# Packages attached for the analysis, in attach order.
packages <- c(
  "tidyverse",
  "ggplot2",
  "dplyr",
  "tidyr",
  "plotly",
  "RColorBrewer"
)

# Attach each package by name. `TRUE` is spelled out because `T` is an
# ordinary variable that can be reassigned.
for (p in packages) {
  library(p, character.only = TRUE)
}
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.2 ──
## ✔ ggplot2 3.3.6 ✔ purrr 0.3.4
## ✔ tibble 3.1.8 ✔ dplyr 1.0.9
## ✔ tidyr 1.2.0 ✔ stringr 1.4.1
## ✔ readr 2.1.2 ✔ forcats 0.5.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
##
## Attaching package: 'plotly'
##
##
## The following object is masked from 'package:ggplot2':
##
## last_plot
##
##
## The following object is masked from 'package:stats':
##
## filter
##
##
## The following object is masked from 'package:graphics':
##
## layout
# Load the three input tables from the working directory.
df_happiness <- read.csv(
  file = "World_Happiness_index(2005 - 2021).csv",
  header = TRUE
)
df_unemployment <- read.csv(file = "Unemployment_Rate.csv", header = TRUE)
# sep = "" makes read.table split fields on any run of white space.
df_continents <- read.table(file = "continents.txt", sep = "", header = TRUE)
## Filtering the dataset
# Happiness table: keep 2010 onwards and shorten the column names.
df_happiness <- df_happiness %>%
  filter(year >= 2010) %>%
  rename(
    Country = Country.name,
    GDP_percapita = Log.GDP.per.capita,
    Life_expextancy = Healthy.life.expectancy.at.birth,
    Corruption_rate = Perceptions.of.corruption
  )

# Unemployment table: same year cut-off, then align the key names with the
# happiness table so the two can be merged on Country and year.
df_unemployment <- df_unemployment %>%
  filter(Year >= 2010) %>%
  rename(
    year = Year,
    Country = Entity,
    Unemployment_Rate =
      Unemployment..total....of.total.labor.force...modeled.ILO.estimate.
  )

# Continent lookup: the country column is called `name` in the raw file.
df_continents <- rename(df_continents, Country = name)

# Inner-merge happiness with unemployment (per country and year), then attach
# the continent of each country.
Final_df <- df_happiness %>%
  merge(
    df_unemployment[, c("Country", "year", "Unemployment_Rate")],
    by = c("Country", "year")
  ) %>%
  merge(df_continents[, c("Country", "continent")], by = "Country")

# Keep only the analysis columns and drop every row with a missing value.
Final_df <- na.omit(
  Final_df[, c(
    "Country", "continent", "year", "Life.Ladder",
    "GDP_percapita", "Life_expextancy", "Unemployment_Rate"
  )]
)

# Number of distinct countries left after merging and NA removal.
length(unique(Final_df[["Country"]]))
## [1] 148
# Per-country averages over all retained years.
# Final_df already carries a `continent` column, so the extra merge against
# df_continents that used to sit here (it only produced continent.x /
# continent.y, neither of which is summarised) has been removed.
final_mean <- Final_df %>%
  group_by(Country) %>%
  summarise(
    mean_happiness = mean(Life.Ladder),
    mean_unemployment = mean(Unemployment_Rate),
    mean_gdp = mean(GDP_percapita),
    mean_life = mean(Life_expextancy)
  )
library(rworldmap)
## Loading required package: sp
## ### Welcome to rworldmap ###
## For a short introduction type : vignette('rworldmap')
# Two-column table (country name, mean happiness) handed to rworldmap below.
ddf <- final_mean[, c("Country", "mean_happiness")]
class(ddf)
## [1] "tbl_df" "tbl" "data.frame"
# Attach the rows of ddf to the world map, matching on the country name held
# in the `Country` column.
sPDF <- joinCountryData2Map(
  ddf,
  joinCode = "NAME",
  nameJoinColumn = "Country"
)
## 148 codes from your data successfully matched countries in the map
## 0 codes from your data failed to match with a country code in the map
## 95 codes from the map weren't represented in your data
library(RColorBrewer)
# World map of mean happiness in fixed-width classes.
# The recorded run with a 4-colour palette warned that 7 colours were required
# and fell back to interpolation, so 7 Spectral colours are requested directly.
colours <- brewer.pal(7, "Spectral")
mapCountryData(
  sPDF,
  nameColumnToPlot = "mean_happiness",
  catMethod = "fixedWidth",
  colourPalette = colours
)
## Warning in rwmGetColours(colourPalette, numColours): 4 colours specified and 7
## required, using interpolation to calculate colours
# One viridis colour per row of Final_df.
v_color <- viridis::viridis(n = nrow(Final_df))

# Give each row the colour at its rank by Life.Ladder. Ranking with ties
# broken by row order is the inverse of the sorting permutation.
Final_df$color <- v_color[rank(Final_df$Life.Ladder, ties.method = "first")]
## Correlation of Happiness score and three explanatory variables.
# Scatterplot matrix of the happiness score and the three predictors, with
# points coloured by the per-row colour stored in Final_df$color.
pairs(
  Life.Ladder ~ GDP_percapita + Life_expextancy + Unemployment_Rate,
  data = Final_df,
  col = Final_df$color,
  pch = 19
)
# Per-country averages with the continent of each country attached.
final_mean_df <- Final_df %>%
  group_by(Country) %>%
  summarise(
    mean_happiness = mean(Life.Ladder),
    mean_unemployment = mean(Unemployment_Rate),
    mean_gdp = mean(GDP_percapita),
    mean_life = mean(Life_expextancy)
  ) %>%
  merge(df_continents[, c("Country", "continent")], by = "Country")
final_mean_df <- as.data.frame(final_mean_df)

# Distribution of country-level mean happiness within each continent.
# theme_dark() is a complete theme: added last it replaced both theme_bw() and
# the axis-text rotation. It now comes first so the rotation takes effect, and
# the overridden theme_bw() is dropped.
ggplot(final_mean_df, aes(x = continent, y = mean_happiness)) +
  geom_boxplot(aes(fill = continent)) +
  theme_dark() +
  theme(axis.text.x = element_text(angle = 90)) +
  labs(
    y = "Average Happiness Score",
    x = "Continents",
    title = "Happiness with continents"
  )
# Per-country averages over the years before 2019.
final_mean2 <- Final_df %>%
  group_by(Country) %>%
  filter(year < 2019) %>%
  summarise(
    mean_happiness = mean(Life.Ladder),
    mean_unemployment = mean(Unemployment_Rate),
    mean_gdp = mean(GDP_percapita),
    mean_life = mean(Life_expextancy)
  )
final_mean3 <- merge(final_mean2, df_continents, by = "Country")

# Sort ascending by mean happiness; the last ten rows are the ten happiest.
df <- final_mean3[order(final_mean3$mean_happiness), ]
final_mean3.top10 <- tail(df, 10)

# Bar chart of the top ten. The bars were previously drawn twice
# (geom_bar(stat = "identity") and geom_col()); a single outlined geom_col()
# draws the same bars.
ggplot(
  final_mean3.top10,
  aes(
    x = reorder(Country, -mean_happiness),
    y = mean_happiness,
    fill = continent
  )
) +
  geom_col(colour = "black") +
  theme(axis.text.x = element_text(angle = 45)) +
  # Zoom the y axis without dropping data so bar heights stay comparable.
  coord_cartesian(ylim = c(4, 8)) +
  scale_fill_manual(
    values = c(
      "Asia" = "#999999", "Europe" = "#009E73",
      "Oceania" = "#E69F00", "Americas" = "#0072B2"
    )
  ) +
  labs(
    y = "Happiness Index",
    x = "Country",
    title = "Top 10 Happiest countries before COVID"
  )
# Per-country averages over the years after 2019.
# NOTE(review): this uses year > 2019 while the "before" chart uses
# year < 2019, so 2019 appears in neither top-10 chart; the later
# before/during split in this script puts 2019 in the "during" group. Confirm
# which cut-off is intended.
final_mean4 <- Final_df %>%
  group_by(Country) %>%
  filter(year > 2019) %>%
  summarise(
    mean_happiness = mean(Life.Ladder),
    mean_unemployment = mean(Unemployment_Rate),
    mean_gdp = mean(GDP_percapita),
    mean_life = mean(Life_expextancy)
  )
final_mean5 <- merge(final_mean4, df_continents, by = "Country")

# Sort ascending by mean happiness; the last ten rows are the ten happiest.
df2 <- final_mean5[order(final_mean5$mean_happiness), ]
final_mean5.top10 <- tail(df2, 10)

# Bar chart of the top ten. The bars were previously drawn twice
# (geom_bar(stat = "identity") and geom_col()); a single outlined geom_col()
# draws the same bars.
ggplot(
  final_mean5.top10,
  aes(
    x = reorder(Country, -mean_happiness),
    y = mean_happiness,
    fill = continent
  )
) +
  geom_col(colour = "black") +
  theme(axis.text.x = element_text(angle = 45)) +
  # Zoom the y axis without dropping data so bar heights stay comparable.
  coord_cartesian(ylim = c(4, 8)) +
  scale_fill_manual(
    values = c(
      "Asia" = "#999999", "Europe" = "#009E73",
      "Oceania" = "#E69F00", "Americas" = "#0072B2"
    )
  ) +
  labs(
    y = "Happiness Index",
    x = "Country",
    title = "Top 10 Happiest countries during COVID"
  )
# Country-level means (years before 2019) of happiness against unemployment,
# with point size showing mean GDP and colour showing continent.
Final_df %>%
  group_by(Country) %>%
  filter(year < 2019) %>%
  summarise(
    mean_happiness = mean(Life.Ladder),
    mean_unemployment = mean(Unemployment_Rate),
    mean_gdp = mean(GDP_percapita),
    mean_life = mean(Life_expextancy)
  ) %>%
  merge(df_continents[, c("Country", "continent")], by = "Country") %>%
  ggplot(
    aes(
      x = mean_unemployment,
      y = mean_happiness,
      size = mean_gdp,
      col = continent
    )
  ) +
  geom_jitter(alpha = 0.6) +
  scale_color_manual(values = cb_palette) +
  # `span` only affects loess smoothing, so it is not passed to the lm fit.
  geom_smooth(method = "lm", se = FALSE) +
  labs(
    y = "Happiness Index",
    x = "Unemployment Rate",
    title = "Happiness vs Unemployment with GDP as third variable"
  )
## `geom_smooth()` using formula 'y ~ x'
# Year as a factor so every year gets its own discrete position on the x axis.
Final_df[["yearfactor"]] <- factor(Final_df[["year"]])

# Mean happiness across all countries, per year.
Final_df %>%
  group_by(yearfactor) %>%
  summarise(mean_happiness = mean(Life.Ladder)) %>%
  ggplot(aes(x = yearfactor, y = mean_happiness)) +
  geom_point() +
  # A constant group joins the points of the discrete axis with one line.
  geom_line(aes(group = 1))
## Time plot of how the Happiness varies over the years for each continent
# Mean happiness per continent and year, one coloured line per continent.
Final_df %>%
  group_by(continent, year) %>%
  # Grouping is dropped explicitly; the plot does not rely on it and this
  # avoids the "grouped output" message from summarise().
  summarise(mean_happiness = mean(Life.Ladder), .groups = "drop") %>%
  ggplot(aes(x = year, y = mean_happiness, color = continent)) +
  geom_point() +
  geom_line() +
  scale_color_manual(values = cb_palette) +
  labs(
    y = "Average Happiness Index",
    x = "Year",
    title = "Change of Happiness over the years"
  )
## `summarise()` has grouped output by 'continent'. You can override using the
## `.groups` argument.
# Happiness score against unemployment rate with one straight-line fit.
# NOTE(review): the rows of Final_df are plotted without aggregation, although
# the axis labels and title say "Avg"/"Average" - confirm the wording.
ggplot(Final_df, aes(x = Unemployment_Rate, y = Life.Ladder)) +
  geom_point() +
  geom_smooth(method = "lm") +
  labs(
    title = "Avg Happiness index against Unemployment Rate",
    x = "Avg Unemployment rate",
    y = "Average Happiness Index"
  )
## `geom_smooth()` using formula 'y ~ x'
# Happiness score against healthy life expectancy with one straight-line fit.
# NOTE(review): the rows of Final_df are plotted without aggregation, although
# the axis labels and title say "Avg"/"Average" - confirm the wording.
Final_df %>%
  ggplot(aes(x = Life_expextancy, y = Life.Ladder)) +
  geom_point() +
  geom_smooth(method = "lm") +
  labs(
    y = "Average Happiness Index",
    # Axis label spelling corrected (was "expectaancy").
    x = "Avg Life expectancy",
    title = "Avg Happiness index against Avg Life expectancy"
  )
## `geom_smooth()` using formula 'y ~ x'
## Happiness Index vs GDP percapita
# Happiness score against GDP per capita with one straight-line fit.
# NOTE(review): the rows of Final_df are plotted without aggregation, although
# the axis labels and title say "Avg"/"Average" - confirm the wording.
ggplot(Final_df, aes(x = GDP_percapita, y = Life.Ladder)) +
  geom_point() +
  geom_smooth(method = "lm") +
  labs(
    title = "Avg Happiness index against Avg GDP percapita",
    x = "Avg GDP percapita",
    y = "Average Happiness Index"
  )
## `geom_smooth()` using formula 'y ~ x'
# Split the data at the end of 2018. `year` is numeric, so it is compared with
# a number; comparing against the string '2018' coerced every year to text and
# relied on string ordering.
# Rows up to and including 2018.
Final_df_2010_2018 <- filter(Final_df, year <= 2018)
# Rows after 2018 (the name says 2018, but 2018 itself is in the set above).
Final_df_2018_2021 <- filter(Final_df, year > 2018)
# Country-level means of happiness and unemployment, with a separate
# straight-line fit for each continent.
Final_df %>%
  group_by(continent, Country) %>%
  summarise(
    mean_happiness = mean(Life.Ladder),
    mean_unemployment = mean(Unemployment_Rate)
  ) %>%
  ggplot(
    aes(x = mean_unemployment, y = mean_happiness, color = continent)
  ) +
  geom_point() +
  geom_smooth(method = "lm", se = FALSE) +
  scale_color_manual(values = cb_palette) +
  labs(
    title = "Average Happiness index against Unemployment Rate grouped by Continents",
    x = "Average Unemployment Rate",
    y = "Average Happiness Index"
  )
## `summarise()` has grouped output by 'continent'. You can override using the
## `.groups` argument.
## `geom_smooth()` using formula 'y ~ x'
## Trend of the Life expectancy on Happiness Index grouped by continents
# Country-level means of happiness and life expectancy, with a separate
# straight-line fit for each continent.
Final_df %>%
  group_by(continent, Country) %>%
  summarise(
    mean_happiness = mean(Life.Ladder),
    mean_lifeexpectancy = mean(Life_expextancy)
  ) %>%
  ggplot(
    aes(x = mean_lifeexpectancy, y = mean_happiness, color = continent)
  ) +
  geom_point() +
  geom_smooth(method = "lm", se = FALSE) +
  scale_color_manual(values = cb_palette) +
  labs(
    title = "Average Happiness index against Average Life expectancy grouped by Continents",
    x = "Average Life expectancy",
    y = "Average Happiness Index"
  )
## `summarise()` has grouped output by 'continent'. You can override using the
## `.groups` argument.
## `geom_smooth()` using formula 'y ~ x'
## Trend of the GDP percapita on Happiness Index grouped by continents
# Country-level means of happiness and GDP per capita, with a separate
# straight-line fit for each continent.
Final_df %>%
  group_by(continent, Country) %>%
  # Grouping is dropped explicitly; the plot does not rely on it and this
  # avoids the "grouped output" message from summarise().
  summarise(
    mean_happiness = mean(Life.Ladder),
    mean_gdppercapita = mean(GDP_percapita),
    .groups = "drop"
  ) %>%
  ggplot(
    aes(x = mean_gdppercapita, y = mean_happiness, color = continent)
  ) +
  geom_point() +
  geom_smooth(method = "lm", se = FALSE) +
  scale_color_manual(values = cb_palette) +
  labs(
    y = "Happiness Index",
    x = "GDP percapita",
    # Title spelling corrected (was "Happines").
    title = "Happiness vs GDP"
  )
## `summarise()` has grouped output by 'continent'. You can override using the
## `.groups` argument.
## `geom_smooth()` using formula 'y ~ x'
# Simple regression of happiness on unemployment, rows up to 2018.
lm(formula = Life.Ladder ~ Unemployment_Rate, data = Final_df_2010_2018)
##
## Call:
## lm(formula = Life.Ladder ~ Unemployment_Rate, data = Final_df_2010_2018)
##
## Coefficients:
## (Intercept) Unemployment_Rate
## 5.65240 -0.02641
# Simple regression of happiness on unemployment, rows after 2018.
lm(formula = Life.Ladder ~ Unemployment_Rate, data = Final_df_2018_2021)
##
## Call:
## lm(formula = Life.Ladder ~ Unemployment_Rate, data = Final_df_2018_2021)
##
## Coefficients:
## (Intercept) Unemployment_Rate
## 5.93290 -0.03915
# Simple regression of happiness on life expectancy, rows up to 2018.
lm(formula = Life.Ladder ~ Life_expextancy, data = Final_df_2010_2018)
##
## Call:
## lm(formula = Life.Ladder ~ Life_expextancy, data = Final_df_2010_2018)
##
## Coefficients:
## (Intercept) Life_expextancy
## -2.3834 0.1237
# Simple regression of happiness on life expectancy, rows after 2018.
lm(formula = Life.Ladder ~ Life_expextancy, data = Final_df_2018_2021)
##
## Call:
## lm(formula = Life.Ladder ~ Life_expextancy, data = Final_df_2018_2021)
##
## Coefficients:
## (Intercept) Life_expextancy
## -3.600 0.142
# Simple regression of happiness on GDP per capita, rows up to 2018.
lm(formula = Life.Ladder ~ GDP_percapita, data = Final_df_2010_2018)
##
## Call:
## lm(formula = Life.Ladder ~ GDP_percapita, data = Final_df_2010_2018)
##
## Coefficients:
## (Intercept) GDP_percapita
## -1.8662 0.7794
# Simple regression of happiness on GDP per capita, rows after 2018.
lm(formula = Life.Ladder ~ GDP_percapita, data = Final_df_2018_2021)
##
## Call:
## lm(formula = Life.Ladder ~ GDP_percapita, data = Final_df_2018_2021)
##
## Coefficients:
## (Intercept) GDP_percapita
## -1.8563 0.7852
# Country-level means of happiness and unemployment for each period, stacked
# into one table with a period label so the two periods can be compared.
temp <- Final_df_2010_2018 %>%
  group_by(Country) %>%
  summarise(
    mean_happiness = mean(Life.Ladder),
    mean_unemployment = mean(Unemployment_Rate)
  ) %>%
  mutate(group.year = "2010-2018")

# Final_df_2018_2021 holds the rows after 2018, hence the 2019-2021 label.
temp_1 <- Final_df_2018_2021 %>%
  group_by(Country) %>%
  summarise(
    mean_happiness = mean(Life.Ladder),
    mean_unemployment = mean(Unemployment_Rate)
  ) %>%
  mutate(group.year = "2019-2021")

full_temp <- union_all(temp, temp_1)

ggplot(
  full_temp,
  aes(x = mean_unemployment, y = mean_happiness, color = group.year)
) +
  geom_point() +
  # `span` only affects loess smoothing, so it is not passed to the lm fit.
  geom_smooth(method = "lm", se = FALSE) +
  scale_color_manual(values = cb_palette) +
  labs(
    y = "Average Happiness Index",
    x = "Average Unemployment Rate",
    title = "Avg Happiness index against Avg Unemployment Rate before and during covid"
  )
## `geom_smooth()` using formula 'y ~ x'
# Country-level means of happiness and life expectancy for each period,
# stacked into one table with a period label so the periods can be compared.
temp <- Final_df_2010_2018 %>%
  group_by(Country) %>%
  summarise(
    mean_happiness = mean(Life.Ladder),
    mean_lifeexpectancy = mean(Life_expextancy)
  ) %>%
  mutate(group.year = "2010-2018")

# Final_df_2018_2021 holds the rows after 2018, hence the 2019-2021 label.
temp_1 <- Final_df_2018_2021 %>%
  group_by(Country) %>%
  summarise(
    mean_happiness = mean(Life.Ladder),
    mean_lifeexpectancy = mean(Life_expextancy)
  ) %>%
  mutate(group.year = "2019-2021")

full_temp <- union_all(temp, temp_1)

ggplot(
  full_temp,
  aes(x = mean_lifeexpectancy, y = mean_happiness, color = group.year)
) +
  geom_point() +
  # `span` only affects loess smoothing, so it is not passed to the lm fit.
  geom_smooth(method = "lm", se = FALSE) +
  scale_color_manual(values = cb_palette) +
  labs(
    y = "Average Happiness Index",
    x = "Average Life expectancy",
    title = "Avg Happiness index against Avg Life expectancy before and during covid"
  )
## `geom_smooth()` using formula 'y ~ x'
# Country-level means of happiness and GDP per capita for each period, stacked
# into one table with a period label so the two periods can be compared.
temp <- Final_df_2010_2018 %>%
  group_by(Country) %>%
  summarise(
    mean_happiness = mean(Life.Ladder),
    mean_gdppercapita = mean(GDP_percapita)
  ) %>%
  mutate(group.year = "2010-2018")

# Final_df_2018_2021 holds the rows after 2018, hence the 2019-2021 label.
temp_1 <- Final_df_2018_2021 %>%
  group_by(Country) %>%
  summarise(
    mean_happiness = mean(Life.Ladder),
    mean_gdppercapita = mean(GDP_percapita)
  ) %>%
  mutate(group.year = "2019-2021")

full_temp <- union_all(temp, temp_1)

ggplot(
  full_temp,
  aes(x = mean_gdppercapita, y = mean_happiness, color = group.year)
) +
  geom_point() +
  # `span` only affects loess smoothing, so it is not passed to the lm fit.
  geom_smooth(method = "lm", se = FALSE) +
  scale_color_manual(values = cb_palette) +
  labs(
    y = "Average Happiness Index",
    x = "Average GDP percapita",
    title = "Avg Happiness index against Avg GDP percapita before and during covid"
  )
## `geom_smooth()` using formula 'y ~ x'
## Predicting Happiness Index during covid by training lm model on data before covid
library(mgcv)
## Loading required package: nlme
##
## Attaching package: 'nlme'
## The following object is masked from 'package:dplyr':
##
## collapse
## This is mgcv 1.8-40. For overview type 'help("mgcv-package")'.
# Linear model of the happiness score on the three predictors, fitted on the
# rows up to 2018 only.
# The misspelled `familiy = 'symmetric'` argument was removed: lm() does not
# take it, and lm.fit() discarded it with a warning.
happiness_lm <- lm(
  Life.Ladder ~ Life_expextancy + GDP_percapita + Unemployment_Rate,
  data = Final_df_2010_2018
)
## Warning: In lm.fit(x, y, offset = offset, singular.ok = singular.ok, ...) :
## extra argument 'familiy' will be disregarded
# Standard diagnostic plots for the fitted linear model.
plot(happiness_lm)

# Predict the happiness score for the rows after 2018 from their predictors
# and store the prediction next to the observed values.
happiness_grid <- Final_df_2018_2021[
  , c("Life_expextancy", "GDP_percapita", "Unemployment_Rate")
]
happiness.predict <- predict(happiness_lm, newdata = happiness_grid)
Final_df_2018_2021$predicted.happiness_lm <- happiness.predict

# Country-level mean of predicted against observed happiness. The black
# reference line y = x marks perfect prediction.
Final_df_2018_2021 %>%
  group_by(Country) %>%
  summarise(
    mean_happiness.predicted = mean(predicted.happiness_lm),
    mean_happiness.original = mean(Life.Ladder)
  ) %>%
  merge(df_continents, by = "Country") %>%
  ggplot(
    aes(
      x = mean_happiness.predicted,
      y = mean_happiness.original,
      color = continent
    )
  ) +
  geom_point() +
  geom_abline(slope = 1, intercept = 0) +
  # `span` only affects loess smoothing, so it is not passed to the lm fit.
  geom_smooth(method = "lm", se = FALSE) +
  scale_color_manual(values = cb_palette) +
  labs(
    x = "Predicted Happiness score",
    y = "Actual Happiness Score",
    title = "Prediction of Happiness (lm model)"
  )
## `geom_smooth()` using formula 'y ~ x'
# Mean squared error of the linear-model predictions on the rows after 2018.
with(Final_df_2018_2021, mean((Life.Ladder - predicted.happiness_lm)^2))
## [1] 0.42365
# Additive model with one smooth term per predictor, fitted on the rows up to
# 2018.
happy_gam <- gam(
  Life.Ladder ~ s(Life_expextancy) + s(GDP_percapita) + s(Unemployment_Rate),
  data = Final_df_2010_2018
)

# Predict for the rows after 2018 and keep the prediction beside the observed
# values.
grid_gam <- Final_df_2018_2021[
  , c("Life_expextancy", "GDP_percapita", "Unemployment_Rate")
]
gam.predict <- predict(happy_gam, newdata = grid_gam)
Final_df_2018_2021[["predicted.happy_gam"]] <- gam.predict

# Country-level mean of predicted against observed happiness. The black
# reference line y = x marks perfect prediction.
Final_df_2018_2021 %>%
  group_by(Country) %>%
  summarise(
    mean_happiness.predicted = mean(predicted.happy_gam),
    mean_happiness.original = mean(Life.Ladder)
  ) %>%
  merge(df_continents, by = "Country") %>%
  ggplot(
    aes(
      x = mean_happiness.predicted,
      y = mean_happiness.original,
      color = continent
    )
  ) +
  geom_point() +
  geom_abline(slope = 1, intercept = 0) +
  geom_smooth(method = "lm", se = FALSE) +
  scale_color_manual(values = cb_palette) +
  labs(
    title = "Prediction of Happiness (GAM model)",
    x = "Predicted Happiness Score ",
    y = "Actual Happiness Score"
  )
## `geom_smooth()` using formula 'y ~ x'
## Mean squared error for GAM model
# Mean squared error of the GAM predictions on the rows after 2018.
with(Final_df_2018_2021, mean((Life.Ladder - predicted.happy_gam)^2))
## [1] 0.3523843
# Yearly mean happiness across all countries for the years before 2019.
timeseries_df <- Final_df %>%
  filter(year < 2019) %>%
  group_by(year) %>%
  summarise(mean_happiness = mean(Life.Ladder))

# Annual series (one value per year, 2010 to 2018) built from the mean column.
timeseries_object <- ts(
  timeseries_df[, "mean_happiness"],
  start = 2010,
  end = 2018,
  frequency = 1
)
timeseries_object
## Time Series:
## Start = 2010
## End = 2018
## Frequency = 1
## mean_happiness
## [1,] 5.510562
## [2,] 5.450370
## [3,] 5.465649
## [4,] 5.402891
## [5,] 5.417631
## [6,] 5.427111
## [7,] 5.420227
## [8,] 5.481281
## [9,] 5.494350
library(forecast)
## Registered S3 method overwritten by 'quantmod':
## method from
## as.zoo.data.frame zoo
##
## Attaching package: 'forecast'
## The following object is masked from 'package:nlme':
##
## getResponse
library(fpp2)
## ── Attaching packages ────────────────────────────────────────────── fpp2 2.4 ──
## ✔ fma 2.4 ✔ expsmooth 2.3
## ── Conflicts ───────────────────────────────────────────────── fpp2_conflicts ──
## ✖ forecast::getResponse() masks nlme::getResponse()
library(TTR)
library(dplyr)
# Holt's linear-trend forecast, three steps beyond the end of the series.
holt_pred <- holt(y = timeseries_object, h = 3)
summary(holt_pred)
##
## Forecast method: Holt's method
##
## Model Information:
## Holt's method
##
## Call:
## holt(y = timeseries_object, h = 3)
##
## Smoothing parameters:
## alpha = 0.852
## beta = 1e-04
##
## Initial states:
## l = 5.5018
## b = -8e-04
##
## sigma: 0.0483
##
## AIC AICc BIC
## -30.05459 -10.05459 -29.06847
##
## Error measures:
## ME RMSE MAE MPE MAPE
## Training set -0.0004676697 0.03601328 0.02691426 -0.01087433 0.4937918
## MASE ACF1
## Training set 0.8844016 -0.06015114
##
## Forecasts:
## Point Forecast Lo 80 Hi 80 Lo 95 Hi 95
## 2019 5.490150 5.428230 5.552071 5.395451 5.584850
## 2020 5.489345 5.407994 5.570696 5.364929 5.613761
## 2021 5.488540 5.391573 5.585507 5.340241 5.636838
# Append the three forecast points (one per forecast step, h = 3) to the
# yearly series that was used for fitting.
holt_pred_df <- as.data.frame(holt_pred)
year <- c(2019, 2020, 2021)
mean_happiness <- holt_pred_df$`Point Forecast`
timeseries_df1 <- data.frame(year, mean_happiness)
timeseries_df <- union_all(timeseries_df, timeseries_df1) %>%
  rename(happiness.forecasted = mean_happiness)

# Observed yearly mean happiness over every year in Final_df.
tempdf <- Final_df %>%
  group_by(year) %>%
  summarise(mean_happiness = mean(Life.Ladder))

# Attach the observed values by matching on `year`. Copying the column across
# by row position only worked while both tables had the same years in the
# same order.
timeseries_df <- timeseries_df %>%
  left_join(
    rename(tempdf, original.happiness = mean_happiness),
    by = "year"
  )

# Long format: one row per year and series (forecasted / original).
timeseries_df <- timeseries_df %>%
  pivot_longer(
    cols = c(happiness.forecasted, original.happiness),
    names_to = "happiness.type",
    values_to = "happinness.value"
  )

ggplot(
  timeseries_df,
  aes(x = year, y = happinness.value, color = happiness.type)
) +
  geom_point() +
  geom_line() +
  scale_color_manual(values = cb_palette) +
  labs(
    x = "Year",
    y = "Happiness Index",
    title = "Holts forecasting"
  )